home *** CD-ROM | disk | FTP | other *** search
- page 55,132
- title TK - Token Parsing filter
- ;
- ; TK --- A Simple Token Parsing Filter for DOS 2.0
- ;
- ; (c) Copyright 1984 by Jim Mott
- ; 3710 Slopeview Drive
- ; Sunnyvale, CA 95148
- ; (408) 274-2620
- ; All rights reserved. Permission granted to use this software for
- ; personal, noncommercial purposes only.
- ;
- ;
- ; This program is designed to be a filter for DOS 2.0.
- ; It will tokenize its input and allow subsetting and/or
- ; single token per line output.
- ;
- ; The format of the command is:
- ;
- ; TK {/RJx | /LJx} {/0} {{/v} | {/v/v}}
- ; where /RJx means right justify all tokens to x positions
- ; /LJx means left justify all tokens to x positions
- ; In both forms x must be in the range [1..15]
- ; /0 means output one token per line
- ; /v means select token v for output. You may select any
- ; number, up to 255, of tokens to output. Repeats are
- ; allowed and you may change the order of the input tokens
- ; on the output line.
- ;
- ; For example, to extract the list of users from a VM directory file and
- ; write a sorted list of them, without passwords, to the printer, the
- ; following command line would be used:
- ;
- ; FIND "USER " < DIRECT.VM | TK/LJ8/2/4/5/6/7/8/9 | SORT > PRN
- ;
- ;
- ; For example, to find a list of all sub-directories of the current
- ; directory sorted by sub-directory name we would use the following
- ; command line:
- ;
- ; DIR | FIND "<DIR>" | TK/LJ8/1/3/4 | SORT | MORE
- ;
- ;
- ; For example, to generate a sorted list of all words used in a document
- ; with one word per line we could use the following command line:
- ;
- ; TK/RJ8/0 < FOOBAR.DOC | SORT | MORE
- ;
- ;
- ;
- ;
- ;
- ;
- ;
- ;
- ;
- ;
- ;
- ;
- stack segment para stack 'STACK' ; program stack: paragraph aligned, 'stack' combine type
- db 8 dup('Jim Mott (408) 274-2620') ; 8 copies of the 23-byte signature = 184 bytes of stack
- stack ends
- ;
- ;
- dsect   segment para 'DATA'
- ;
- ; Line buffer filled by bufget and scanned by nextok.
- ;
- buffer  db      255 dup('?')            ; one input line, at most 255 bytes
-         db      ' '                     ; guard blank so a token scan always ends
- ;
- ; Read-ahead buffer that cget refills from standard input.
- ;
- glen    dw      0                       ; bytes of gbuff not yet handed out
- gbptr   dw      gbuff                   ; next byte of gbuff to hand out
- gbuff   db      255 dup('G')            ; raw data as read from DOS
- ;
- ; Option and state bits. flag1 must start out clear: the code only
- ; ORs bits into it and tests them, so it is initialised here instead
- ; of being left as an undefined '?' byte.
- ;
- flag1   db      0                       ; options enabled / program state
- f1rj    equ     01h                     ; right justify tokens
- f1lj    equ     02h                     ; left justify tokens
- f1one   equ     04h                     ; output one token per line
- f1sub   equ     08h                     ; token subset (/v) requested
- f1work  equ     10h                     ; trailing spaces still to be filled
- f1oerr  equ     20h                     ; error in options string
- f1eof   equ     40h                     ; end of file on standard input device
- f1qeof  equ     80h                     ; report end of file on the next read
- ;
- spaces  db      ?                       ; number of trailing spaces required
- ;
- toksiz  db      ?                       ; token size if (f1rj or f1lj)
- ;
- three   db      3                       ; length of each toktbl entry
- tokcnt  db      ?                       ; count of tokens in table
- toktbl  db      3*255 dup('0')          ; per token: length byte + offset word
- ;
- outcnt  db      ?                       ; number of subsetting entries in
-                                         ; outers
- outers  db      255 dup('1')            ; list of token numbers to output
- ;
- ; Token string space. A line holds at most 254 data characters, i.e.
- ; at most 127 blank-separated tokens, and a justified token occupies
- ; toksiz (at most 15) bytes, so the worst case is 127*15 bytes. The
- ; former 900 bytes could be overrun by /RJ15 or /LJ15 on a line of
- ; many short tokens.
- ;
- tokptr  dw      ?                       ; pointer to free token space
- tokens  db      127*15 dup('2')         ; string space of tokens
- ;
- msgver  db      'TK: Incorrect DOS version. Must be at least 2.00.'
-         db      0dh,0ah,'$'
- optmsg  db      'TK: Incorrect parameters given.'
-         db      0dh,0ah
- optmgl  equ     $ - optmsg
- noroom  db      'TK: No room for user on device.'
-         db      0dh,0ah
- lnoroom equ     $ - noroom
- chrspa  db      ' '                     ; a space to output
- chrclf  db      0dh,0ah                 ; <cr><lf> sequence
- dsect   ends
- ;
- ;
- csect segment para 'CODE'
- assume cs:csect,ds:dsect,ss:stack
- ;
- main    proc    far
- ;
- ; main - Program entry point. Checks the DOS version, parses the
- ;        command tail, then reads standard input a line at a time,
- ;        tokenizes each line and writes the requested tokens.
- ;        The process return code is 0 on normal completion and 1
- ;        when the command line options were rejected.
- ;
-         push    ds                      ; build a far return address of
-         sub     ax,ax                   ; (DS on entry):0000 for the
-         push    ax                      ; DOS 1.x exit path below
-         mov     ax,dsect                ; point to start of data area
-         mov     ds,ax                   ; make assume and reality agree
-         mov     ah,30h                  ; get DOS version number
-         int     21h                     ; al = major version
-         cmp     al,2                    ; is it at least 2.00?
-         jae     main00                  ; yes - good enough (unsigned test)
-         lea     dx,msgver               ; no - point to the "Bad DOS version"
-         mov     ah,9                    ; message and use the DOS 1.x
-         int     21h                     ; print-string call to show it
-         ret                             ; far return to the address pushed above
- ;
- main00: call    options                 ; parse the options (at ES:80) and set
-                                         ; flags
-         mov     ax,ds                   ; make ES and DS the same now
-         mov     es,ax                   ; so the string moves work nicely
-         test    flag1,f1oerr            ; was there an error in the options?
-         jz      main01                  ; no - start filtering
-         lea     dx,optmsg               ; yes - point to options error message
-         mov     cx,optmgl               ; get length of message
-         mov     bx,2                    ; error output device handle
-         mov     ah,40h                  ; "write to file or device"
-         int     21h
-         mov     al,1                    ; report the failure to the caller
-         jmp     short main04            ; instead of exiting with code 0
- ;
- main01: mov     tokcnt,0                ; no tokens in the table
-         lea     ax,tokens               ; point to start of token work area
-         mov     tokptr,ax               ; save pointer to next free byte
-         call    bufget                  ; read in a line, cx = its length
-         test    flag1,f1eof             ; is there any data in the read buffer?
-         jnz     main03                  ; no - we are done
-         dec     cx                      ; yes - ignore the trailing <cr>
-         jle     main01                  ; if length is <= 0 just get next line
-         lea     bx,buffer               ; point to the first byte of the data
- ;
- main02: call    nextok                  ; get the next token
-         or      cx,cx                   ; are we done with this line yet?
-         jnz     main02                  ; no - get yet another token
-         call    write                   ; write the line
-         jmp     short main01            ; and loop for the next line
- ;
- main03: mov     al,0                    ; normal completion: return code 0
- ;
- main04: call    crlf                    ; write a final <cr><lf> sequence
-                                         ; (crlf saves and restores ax)
-         mov     ah,4ch                  ; terminate process, al = return code
-         int     21h                     ; end this program
- ;
- main    endp
- ;
- ;
- ; OPTIONS - Parse the command tail and set the option bits in flag1.
- ;
- ;   In:  es = segment holding the command tail; parsing starts at
- ;        offset 81h and stops at the terminating <cr>.
- ;   Out: flag1  f1lj/f1rj, f1one, f1sub set as requested; f1oerr set
- ;               if the tail is malformed
- ;        toksiz field width for /LJx or /RJx (1..15)
- ;        outers/outcnt list and count of token numbers from /v
- ;   No registers are preserved since we are called only once, before
- ;   the program has really started.
- ;
- options proc    near
- ;
-         mov     flag1,0                 ; start with every option bit clear
-         mov     outcnt,0                ; initialize outers count
-         mov     si,81h                  ; point to the first parms character
- ;
- opt01:  mov     al,byte ptr es:0[si]    ; get a byte from the parm string
-         inc     si                      ; point to the next byte
-         cmp     al,0dh                  ; is it the end of the string?
-         jne     opt02                   ; no - more data to process
-         ret                             ; yes - return to the caller
- ;
- opt02:  cmp     al,' '                  ; allow spaces anywhere before slashes
-         je      opt01                   ; ignore them though
-         cmp     al,'/'                  ; every option starts with a slash
-         je      opt04                   ; if it is a slash then process it
- ;
- opterr: or      flag1,f1oerr            ; otherwise set the options error flag
-         ret                             ; and return
- ;
- opt04:  mov     al,byte ptr es:0[si]    ; get the next character after slash
-         inc     si                      ; point to next character in parms
-         cmp     al,'a'                  ; is it lower case?
-         jl      opt4a                   ; no - process it as it is
-         sub     al,'a'-'A'              ; yes - map lower case to upper
- ;
- opt4a:  cmp     al,'L'                  ; might it be left justify or numeric?
-         jl      optnum                  ; below 'L' - try it as a number
-         jne     opt05                   ; it is not LJ for sure
-         or      flag1,f1lj              ; assume it is LJ for the moment
-         test    flag1,f1rj              ; RJ and LJ are mutually exclusive
-         jnz     opterr                  ; RJ already given - error
-         jmp     short opt06             ; and rejoin common justify code
- ;
- opt05:  cmp     al,'R'                  ; might it be right justify (RJ)?
-         jne     opterr                  ; no - then it is an error
-         or      flag1,f1rj              ; yes - assume for the moment it is
-         test    flag1,f1lj              ; RJ and LJ are mutually exclusive
-         jnz     opterr                  ; LJ already given - error
- ;
- opt06:  mov     al,byte ptr es:0[si]    ; get the next character
-         inc     si                      ; point to the next character in parms
-         cmp     al,'J'                  ; is it the J we expect?
-         je      opt6a                   ; yes - go read the field width
-         cmp     al,'j'                  ; is it a lower case J?
-         jne     opterr                  ; no - error
- ;
- opt6a:  mov     al,byte ptr es:0[si]    ; get the first byte of the number
-         inc     si                      ; point to next character in parms
-         call    decbin                  ; is it a number?
-         jc      opterr                  ; no - then we have an error
-         or      al,al                   ; is the field size 0?
-         je      opterr                  ; yes - it is in error then
-         cmp     al,15                   ; is field size more than 15?
-         ja      opterr                  ; yes - error. The test must be
-                                         ; unsigned: decbin returns 0..255 and
-                                         ; a signed test let 128..255 through
-         mov     toksiz,al               ; save the justified field size
-         jmp     short opt01             ; and process further options
- ;
- optnum: call    decbin                  ; is it a number after slash?
-         jc      opterr                  ; no - then it is an error
-         or      al,al                   ; zero is special
-         jne     opt08                   ; not zero - save it in array then
-         or      flag1,f1one             ; zero means one token per line
-         jmp     short opt01             ; process the next option
- ;
- opt08:  sub     cx,cx                   ; get a zeroed double register
-         mov     cl,outcnt               ; entries already stored (0 based)
-         lea     bx,outers               ; point to the start of outers
-         add     bx,cx                   ; bx points to the next free slot
-         mov     byte ptr [bx],al        ; save the token number to write
-         inc     outcnt                  ; add one to outcnt
-         or      flag1,f1sub             ; make sure the subset flag is on
-         jmp     opt01                   ; and go round again
- ;
- options endp
- ;
- ;
- ; nextok - Find the next blank-delimited token in the input line,
- ;          copy it (padded or truncated to toksiz when /LJ or /RJ
- ;          is active) to the end of the token space and add an
- ;          entry for it to toktbl.
- ;
- ;   In:  bx = address of the next unread character of the line
- ;        cx = number of unread characters (not zero)
- ;        es = ds (rep movsb stores through es:di)
- ;   Out: bx = address of the first character past the token
- ;        cx = characters still unread, zero when the line is empty
- ;        tokptr, tokcnt and toktbl updated if a token was found
- ;   ax, dx, si and di are not preserved.
- ;
- ;   A toktbl entry is three bytes: the token length followed by the
- ;   offset of the token text in the token space.
- ;
- nextok  proc    near
- ;
-         mov     di,tokptr               ; get pointer to where to put token
- ;
- next01: mov     al,byte ptr 0[bx]       ; loop past leading blanks
-         cmp     al,' '                  ; is it a leading space?
-         jne     next03                  ; no - then we have a token
-         inc     bx                      ; yes - point to the next character
-         loop    next01                  ; and try that one
-         ret                             ; only blanks were left: cx is zero
- ;
- next03: mov     si,bx                   ; save pointer to start of token
-         mov     ah,1                    ; initial guess for token length is 1
- ;
- next04: inc     bx                      ; point to the next character in input
-         mov     al,byte ptr [bx]        ; get the character
-         cmp     al,' '                  ; is it the end of the token?
-         je      next05                  ; yes - ah holds the token length
-         inc     ah                      ; no - increment count of contiguous
-                                         ; characters
-         loop    next04                  ; continue till out of chars or a
-                                         ; field separator
-         dec     ah                      ; ran out of data: the last character
-                                         ; examined lies past the line end
- next05: push    cx                      ; save number of chars left in source
-                                         ; string
-         test    flag1,f1rj + f1lj       ; do we have a fixed token length?
-         jz      next09                  ; no - just a normal token copy then
-         cmp     ah,toksiz               ; yes - is this token just right?
-         je      next09                  ; it is - keep it as is
-         jb      next06                  ; shorter than the field - pad it.
-                                         ; Unsigned test: a signed one sent
-                                         ; tokens of 128+ chars down the pad
-                                         ; path with a bogus space count
-         mov     ah,toksiz               ; longer - truncate to the field size
-         jmp     short next09            ; and continue normally
- ;
- next06: mov     al,toksiz               ; get the token size we must pad to
-         sub     al,ah                   ; al contains number of spaces needed
-         test    flag1,f1lj              ; left justify? (pad right with space)
-         jz      next07                  ; no - must pad to the left with space
-         mov     spaces,al               ; yes - save how many spaces to fill
-         or      flag1,f1work            ; mark as work to do after the copy
-         jmp     short next09            ; and join mainline code
- ;
- next07: mov     cl,al                   ; cx contains number of leading spaces
- ;
- next08: mov     byte ptr [di],' '       ; put a leading space in this token
-         inc     di                      ; point to the next slot
-         loop    next08                  ; and fill in all needed spaces
- ;
- next09: mov     cl,ah                   ; cx now contains the number of
-                                         ; token chars to copy
-         cld                             ; copy upward
-         rep     movsb                   ; move the token to its spot
-         test    flag1,f1work            ; is it left justified (need spaces)?
-         jz      next11                  ; no - the copy is complete
-         mov     cl,spaces               ; get count of spaces needed
- ;
- next10: mov     byte ptr [di],' '       ; move in a trailing space
-         inc     di                      ; point to the next slot and
-         loop    next10                  ; cont. till all trailing spaces done
-         and     flag1,255-f1work        ; reset the work to do bit
- ;
- next11: mov     dx,tokptr               ; get pointer to start of this token
-         mov     tokptr,di               ; save pointer to next free token byte
-         test    flag1,f1lj + f1rj       ; do we have fixed length tokens?
-         jz      next12                  ; no - take them as we get them
-         mov     ah,toksiz               ; yes - every token is toksiz long
- ;
- next12: mov     cl,ah                   ; save length of token
-         mov     al,3                    ; number of bytes per entry
-         mul     tokcnt                  ; ax is now an offset in toktbl
-         lea     si,toktbl               ; point to start of token table
-         add     si,ax                   ; si points to an entry in toktbl
-         mov     byte ptr [si],cl        ; move in length of entry
-         mov     word ptr 1[si],dx       ; save pointer to start of token
-         inc     tokcnt                  ; count one more token
-         pop     cx                      ; cx contains number of source chars
-         or      cx,cx                   ; left. Are we done yet?
-         jz      next13                  ; yes - return
-         dec     cx                      ; no - the scan did not count the
-                                         ; first character of the token
- ;
- next13: ret                             ; and return
- ;
- nextok  endp
- ;
- ;
- ; write - Write the tokens of the current line to the standard
- ;         output device. With f1sub set only the tokens listed in
- ;         outers are written, in that order; otherwise all tokens
- ;         are written in input order. Unless f1one is set (tout then
- ;         ends every token with <cr><lf>) the line is closed with a
- ;         <cr><lf>.
- ;
- ;   In:  tokcnt/toktbl describe the tokens of the line
- ;   ax, bx, cx and dx are not preserved.
- ;
- write   proc    near
- ;
-         sub     cx,cx                   ; get an empty loop counter
-         mov     cl,tokcnt               ; cl contains total number tokens read
-         or      cx,cx                   ; do we have anything to write out?
-         jnz     write1                  ; yes - then go for it
-         ret                             ; no - we are done before we begin
- ;
- write1: test    flag1,f1sub             ; are we selecting/reordering tokens?
-         jnz     write3                  ; yes - then use different write logic
-         sub     dl,dl                   ; no - just output them all in order
- ;
- write2: call    tout                    ; write token dl
-         inc     dl                      ; point to the next token
-         loop    write2                  ; and go through them all
-         jmp     short write6            ; then finish the line
- ;
- write3: mov     cl,outcnt               ; get the number tokens to write
-         lea     bx,outers               ; point to the first one to output
- ;
- write4: mov     dl,byte ptr [bx]        ; get a token number (origin one)
-         cmp     dl,tokcnt               ; is it <= number of tokens read?
-         ja      write5                  ; no - don't write it then. Unsigned
-                                         ; test: numbers 128..255 looked
-                                         ; negative to a signed one and were
-                                         ; written from unused table entries
-         dec     dl                      ; yes - adjust for origin one and
-         call    tout                    ; write this token then
- ;
- write5: inc     bx                      ; point to the next token number to
-         loop    write4                  ; write and loop through whole list
- ;
- write6: test    flag1,f1one             ; are we outputting one token/line?
-         jnz     write7                  ; yes - the last <cr><lf> was written
-         call    crlf                    ; no - write a trailing <cr><lf>
- ;
- write7: ret                             ; done
- ;
- write   endp
- ;
- ;
- ; tout - Write one token of the current line to standard output.
- ;        The token is followed by <cr><lf> when f1one is set and by
- ;        a single space otherwise.
- ;
- ;   In:  dl = zero-based position of the token in toktbl
- ;   bx, cx and dx are preserved; ax is not.
- ;
- tout    proc    near
- ;
-         push    bx                      ; keep the caller's loop registers
-         push    cx
-         push    dx
-         mov     al,3                    ; each toktbl entry is 3 bytes
-         mul     dl                      ; ax = offset of the wanted entry
-         mov     bx,offset toktbl
-         add     bx,ax                   ; bx -> length byte, then text offset
-         mov     dx,word ptr 1[bx]       ; dx -> token text
-         mov     cl,byte ptr [bx]        ; cx = token length
-         xor     ch,ch
-         call    oswrite                 ; send the token text
-         test    flag1,f1one             ; one token per line requested?
-         jnz     toline                  ; yes - end the line instead
-         mov     dx,offset chrspa        ; no - separate tokens with a blank
-         mov     cx,1
-         call    oswrite
-         jmp     short todone
- ;
- toline: call    crlf                    ; terminate this token's line
- ;
- todone: pop     dx                      ; give the registers back
-         pop     cx
-         pop     bx
-         ret
- ;
- tout    endp
- ;
- ;
- ; crlf - Write a <cr><lf> pair to standard output through oswrite.
- ;        ax, bx, cx and dx are all preserved.
- ;
- crlf    proc    near
- ;
-         push    ax                      ; oswrite destroys ax and bx, and we
-         push    bx                      ; load cx and dx ourselves
-         push    cx
-         push    dx
-         mov     dx,offset chrclf        ; ds:dx -> the two bytes to send
-         mov     cx,2                    ; and there are two of them
-         call    oswrite
-         pop     dx
-         pop     cx
-         pop     bx
-         pop     ax
-         ret
- ;
- crlf    endp
- ;
- ;
- ; oswrite - Write cx bytes starting at ds:dx to the standard output
- ;           device (handle 1). If DOS reports an error, or writes
- ;           fewer bytes than requested, the "no room" message goes
- ;           to the standard error device (handle 2) and f1eof is set
- ;           so that the main loop winds down.
- ;
- ;   In:  ds:dx = data, cx = byte count
- ;   ax and bx are destroyed; on the error path cx and dx are too.
- ;
- oswrite proc    near
- ;
-         mov     ah,40h                  ; DOS: write to file or device
-         mov     bx,1                    ; handle 1 = standard output
-         int     21h
-         jc      owfail                  ; DOS signalled an error
-         cmp     cx,ax                   ; everything we asked for written?
-         jne     owfail                  ; short write - treat as failure
-         ret
- ;
- owfail: mov     ah,40h                  ; same DOS call again, this time
-         mov     bx,2                    ; to handle 2 = standard error
-         mov     cx,lnoroom              ; length of the message
-         mov     dx,offset noroom        ; and its text
-         int     21h                     ; tell the user
-         or      flag1,f1eof             ; fake end of input so the program stops
-         ret
- ;
- oswrite endp
- ;
- ;
- ; decbin - Convert a run of decimal digits in the command tail to a
- ;          binary value in the range 0..255.
- ;
- ;   In:  al    = first character of the number
- ;        es:si = the characters that follow it
- ;   Out: carry clear - al = value, si -> first non-digit character
- ;        carry set   - al was not a digit, or the value exceeds 255
- ;   bx is preserved; ah is not.
- ;
- decbin  proc    near
- ;
-         push    bx                      ; bl/bh serve as scratch below
-         call    dbdig                   ; first character must be a digit
-         jc      dbbad
- ;
- dbloop: mov     bl,al                   ; bl = value accumulated so far
-         mov     al,byte ptr es:[si]     ; fetch the next character
-         inc     si
-         call    dbdig                   ; another digit?
-         jc      dbend                   ; no - the number is complete
-         mov     bh,al                   ; park the new digit
-         mov     al,10
-         mul     bl                      ; ax = old value * 10
-         mov     bl,bh
-         xor     bh,bh                   ; bx = new digit as a word
-         add     ax,bx                   ; ax = old value * 10 + digit
-         or      ah,ah                   ; does it still fit in a byte?
-         jz      dbloop                  ; yes - keep collecting digits
- ;
- dbbad:  stc                             ; not a number, or too large
-         pop     bx
-         ret
- ;
- dbend:  mov     al,bl                   ; hand back the accumulated value
-         dec     si                      ; un-read the non-digit character
-         clc                             ; signal success
-         pop     bx
-         ret
- ;
- ; dbdig - local helper: turn the character in al into its digit
- ;         value. Carry clear and al = 0..9 for '0'..'9', carry set
- ;         for anything else.
- ;
- dbdig:  sub     al,'0'                  ; below '0'?
-         jl      dbnot
-         cmp     al,9                    ; above '9'?
-         jle     dbis
- ;
- dbnot:  stc                             ; not a decimal digit
-         ret
- ;
- dbis:   clc                             ; al is 0..9
-         ret
- ;
- decbin  endp
- ;
- ;
- ; bufget - Read one 'line' from the standard input device into
- ;          buffer. A line ends at a <cr>, after 255 characters, or
- ;          at end of file. Control characters other than <cr> are
- ;          replaced by blanks.
- ;
- ;   Out: cx = number of characters in buffer, including the
- ;             terminating <cr>
- ;        f1eof set if end of file was met with nothing left to
- ;        return. When end of file cuts a line short, the line is
- ;        returned first (with a <cr> added) and f1qeof makes the
- ;        next call report the end of file.
- ;   ax, bx, dx, di and si are preserved.
- ;
- bufget  proc    near
- ;
-         push    ax                      ; save the registers
-         push    bx
-         push    dx
-         push    di
-         push    si
-         test    flag1,f1qeof            ; should we reflect an immediate eof?
-         jz      bufg00                  ; no - standard logic here then
-         or      flag1,f1eof             ; yes - set the end of file bit
-         and     flag1,255 - f1qeof      ; and say it is no longer pending
-         jmp     short bufret            ; return now
- ;
- bufg00: sub     cx,cx                   ; count of characters gotten
-         lea     di,buffer               ; point destination to buffer
- ;
- bufg01: call    cget                    ; get one character in ah
-         test    flag1,f1eof             ; did we get an eof on that try?
-         jz      bufg02                  ; no - go store the character
-         or      cl,cl                   ; is there anything in the buffer?
-         jz      bufret                  ; no - just return with cx=0 and f1eof
-         mov     ah,0dh                  ; yes - slap a <cr> on the end
-         call    cput                    ; put it at end of buffer
-         mov     cx,di                   ; count what is actually stored,
-         sub     cx,offset buffer        ; the added <cr> included. The caller
-                                         ; drops one for the <cr>; leaving it
-                                         ; uncounted lost the last character
-                                         ; of an unterminated final line
-         and     flag1,255 - f1eof       ; clear the end of file bit
-         or      flag1,f1qeof            ; say next time turn on eof for sure
-         jmp     short bufret            ; and return this last buffer
- ;
- bufg02: cmp     ah,0dh                  ; is it the record terminator?
-         je      bufg03                  ; yes - don't turn that into a space
-         cmp     ah,20h                  ; no - is it a control character?
-         jae     bufg03                  ; 20h and up is used as is. Unsigned
-                                         ; test, so that characters 80h..0ffh
-                                         ; are kept too
-         mov     ah,' '                  ; otherwise make it a space
- ;
- bufg03: call    cput                    ; store the character in buffer
-         cmp     ah,0dh                  ; just stored the record terminator?
-         je      bufret                  ; yes - then return
-         cmp     cl,255                  ; read 255 characters yet?
-         jne     bufg01                  ; no - get the next character
-         mov     byte ptr [di],0dh       ; buffer full - make the last slot a
-                                         ; terminator
- ;
- bufret: pop     si                      ; restore the registers
-         pop     di
-         pop     dx
-         pop     bx
-         pop     ax
-         ret                             ; and return
- ;
- bufget  endp
- ;
- ;
- ;
- cget    proc    near
- ;
- ; cget - Return the next character of standard input in ah, refilling
- ;        gbuff from DOS (handle 0, up to 255 bytes) when it is empty.
- ;
- ;   In:  cl = running character count kept by the caller
- ;   Out: ah = character and cl incremented, or f1eof set in flag1
- ;        (ah and cl meaningless/unchanged) when no more data can be
- ;        read. A failed read is treated like end of file.
- ;   bx and dx are destroyed; cx (apart from the count in cl) and di
- ;   are preserved.
- ;
-         mov     dx,glen                 ; is there any data in gbuff?
-         or      dx,dx                   ; if count is zero there isn't
-         jnz     cget01                  ; there is data so hand it out
-         push    cx                      ; save the registers DOS needs
-         push    di
-         mov     ah,3fh                  ; DOS function: read from handle
-         mov     bx,0                    ; file handle for standard in
-         mov     cx,255                  ; number of characters to read
-         lea     dx,gbuff                ; point to where to put the data
-         mov     gbptr,dx                ; save pointer to first character
-         int     21h                     ; call DOS function
-         jnc     cget00                  ; carry clear - ax is the byte count
-         sub     ax,ax                   ; carry set - ax is an error code, not
-                                         ; a count: report "nothing read"
- cget00: mov     glen,ax                 ; save the number of characters read
-         mov     dx,ax                   ; put data count in dx
-         pop     di                      ; restore the registers
-         pop     cx
-         or      dx,dx                   ; did we get data or eof?
-         jnz     cget01                  ; data this time
-         or      flag1,f1eof             ; set the end of file encountered bit
-         ret                             ; and return
- ;
- cget01: mov     bx,gbptr                ; get pointer to the next character
-         mov     ah,byte ptr [bx]        ; fetch it
-         inc     gbptr                   ; step past it
-         dec     glen                    ; one character fewer left in gbuff
-         inc     cl                      ; count this character
-         ret                             ; and return the character
- ;
- cget    endp
- ;
- ;
- ;
- ;
- cput    proc    near
- ;
- ; cput - Store the character in ah at [di] and advance di, except
- ;        when the caller's count in cl has reached 255: di then
- ;        stays on the slot just written so that it never moves on
- ;        past it.
- ;
-         mov     byte ptr [di],ah        ; drop the character into the line
-         inc     di                      ; assume there is room for another
-         cmp     cl,0ffh                 ; was that the 255th character?
-         je      cpfull                  ; yes - take the advance back
-         ret
- ;
- cpfull: dec     di                      ; stay on the last slot
-         ret
- ;
- cput    endp
- ;
- csect ends
- end main